Preparations

Load libraries and functions

library("cluster")
library("dendextend")
## 
## ---------------------
## Welcome to dendextend version 1.7.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
source("functions.R")
## Loading required package: ggplot2

Corpus description and selection

Load data

# Get data with Stylo
# data = stylo::load.corpus.and.parse(corpus.dir = "dh-meier-data/output/transkribus/tokenized/boudams/", features = "w", ngram.size = 1, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_expanded_words.csv")
# Read the cached frequency table from disk and transpose it, so that the
# CSV's rows become the columns of `data`
data <- t(read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1))

Text lengths

# Column totals of the frequency table, i.e. one total count per column
nwords <- colSums(data)
# Five-number summary (plus mean) of those totals
summary(nwords)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     298    2244    3539    5070    6774   18971
# Draw the boxplot once and keep the statistics it returns, instead of
# rendering the same figure a second time just to read the outliers
nwordsBox <- boxplot(nwords)
# Values lying beyond the whiskers
nwordsBox$out

## 05_Ano_Leg-A_Ap_NA_Vie_Jacques  29_Wau_Leg-C_Co_Ev_Vie_Martin 
##                          17920                          14432 
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial 
##                          18971                          15255
# The 15 smallest totals, in increasing order
head(sort(nwords, decreasing = FALSE), 15)
##          03_Ano_Leg-A_Ap_NA_Mar_Jean          62_Ano_Leg-N_NA_NA_NA_Index 
##                                  298                                  301 
##       61_Ano_Leg-B_NA_NA_NA_Jugement       30_Wau_Leg-C_Co_Ev_Tra_Martin2 
##                                  406                                  722 
##      08_Ano_Leg-A_Ap_NA_Vie_Philippe     59_Ano_Leg-C_Vi_NA_Vie_Euphrasie 
##                                 1014                                 1293 
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur         32_Wau_Leg-C_Co_Ev_Vie_Brice 
##                                 1356                                 1385 
##    60_Ano_Leg-B_NA_NA_NA_Antechriste       54_Ano_Leg-C_Vi_NA_Vie_Pelagie 
##                                 1485                                 1506 
##      20_Ano_Leg-B_Ma_Fe_Vie_Felicite          11_Ano_Leg-A_Ap_NA_Vie_Marc 
##                                 1676                                 1820 
##         23_Ano_Leg-B_Ma_Ho_Vie_Sixte    53_Ano_Leg-C_Vi_NA_Vie_Marguerite 
##                                 1894                                 1935 
##       35_Wau_Leg-C_Co_Ev_Vie_Nicolas 
##                                 1960
# Retain only the columns whose total count exceeds 1000
toKeep <- colnames(data)[nwords > 1000]

# Drop every name that contains "Bestiaire"
toKeep <- grep("Bestiaire", toKeep, invert = TRUE, value = TRUE)

Transkribus raw data

3-grams from raw data

Load data

# Get data with Stylo
#data = stylo::load.corpus.and.parse(corpus.dir = "dh-meier-data/output/transkribus/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_raw_char3grams.csv")
# Read the cached character 3-gram table and transpose it
data <- t(read.csv("data/transkr_raw_char3grams.csv", header = TRUE, row.names = 1))
# Restrict to the selected columns
data <- data[, toKeep]
# Discard rows that are empty once the columns have been restricted
data <- data[rowSums(data) > 0, ]

Burrows + vector-length norm

d <- data
# Feature selection based on Moisl 2011: column 4 of the table returned by
# selection() is used below as the row selector
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies, saved for the robustness checks
# NOTE(review): saved BEFORE feature selection here, whereas AffixesSave,
# POS3grSave and LemmasSave are saved after it - confirm this is intended
Raw3grSave <- relativeFreqs(d)
# Apply the selection, then the normalisations
d <- normalisations(Raw3grSave[select, ])
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHRaw3gr <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHRaw3gr
# TODO: heights
# barplot(sort(myCAH$height))
plotRaw3grams <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Characters 3-grams from raw data (Transkr)"
)

# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotRaw3grams <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Characters 3-grams from raw data (Transkr)"
)

Transkribus expanded data

Load data

# Reload the expanded word table and transpose it
data <- t(read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1))
# Restrict to the selected columns
data <- data[, toKeep]
# Discard rows that are empty once the columns have been restricted
data <- data[rowSums(data) > 0, ]

Forms from expanded data

Burrows + vector-length norm

d <- data
# Feature selection based on Moisl 2011: column 4 of the table returned by
# selection() is used below as the row selector
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies, saved for the robustness checks
# NOTE(review): saved BEFORE feature selection here, whereas AffixesSave,
# POS3grSave and LemmasSave are saved after it - confirm this is intended
WordsSave <- relativeFreqs(d)
# Apply the selection, then the normalisations
d <- normalisations(WordsSave[select, ])
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHForms <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHForms
# TODO: heights
# barplot(sort(myCAH$height))
plotForms <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Expanded word forms (Transkr/Boudams/Pie)"
)

# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotForms <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Expanded word forms (Transkr/Boudams/Pie)"
)

Affixes from expanded data

# Build the affix table from the full word table with countAffixes()
dataAffs <- countAffixes(data)

Burrows + vector-length norm

d <- dataAffs
# Feature selection based on Moisl 2011: column 4 of the table returned by
# selection() is used below as the row selector
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies of the selected rows; saved because the combined
# analysis stacks AffixesSave with the POS and function-lemma tables
AffixesSave <- relativeFreqs(d)[select, ]
d <- normalisations(AffixesSave)
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHAffs <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHAffs
# TODO: heights
# barplot(sort(myCAH$height))
plotAffixes <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Expanded affixes (Transkr/Boudams/Pie)"
)
# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotAffixes <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Expanded affixes (Transkr/Boudams/Pie)"
)

Unstandardised function words from expanded data

Create function words list

# The most frequent rows can be listed with:
# labels(sort(rowSums(data), decreasing = TRUE)[1:300])
# Open question: with or without pronouns?
# The list is the value of the last expression evaluated in functionWords.R
functionWords <- source("functionWords.R")[["value"]]

Burrows + vector-length norm

# Relative frequencies restricted to the function-word rows; saved for the
# robustness checks
FWSave <- relativeFreqs(data)[functionWords, ]
d <- normalisations(FWSave)
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHFW <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHFW
# barplot(sort(myCAH$height))
# NOTE(review): k = 8 here, while the SOM plot below and compareHC() use
# k = 9 - confirm this is intended
plotFW <- cahPlotCol(
  myCAH,
  k = 8,
  main = "Function words with pronouns and auxiliaries\n(Transkr/Boudams/Pie)"
)
# plotCol(myCAH, main = "toto")
# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotFW <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words")

Transkribus with linguistic annotation

POS 3-grams

# Load the POS 3-gram table (semicolon-separated, first column = row names)
data <- read.csv("data/transkr_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
# Remove the total-frequency column
data <- data[, -1]
# Bring the column names back to the form used in toKeep: strip the leading
# "X", drop the ".decolumnized" suffix and restore the hyphen after "Leg"
# (presumably all introduced by read.csv()'s name checking - confirm).
colnames(data) <- gsub("^X", "", colnames(data))
# fixed = TRUE: the dots are literal characters here. As regular expressions,
# ".decolumnized" and "Leg." would match ANY character in place of the dot
# (e.g. "Leg." would also rewrite "Lege").
colnames(data) <- gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Restrict to the selected columns, then discard rows left empty
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)

Burrows + vector-length norm

d <- data
# Feature selection based on Moisl 2011: column 4 of the table returned by
# selection() is used below as the row selector
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies of the selected rows; saved for the robustness checks
# and reused by the combined analysis
POS3grSave <- relativeFreqs(d)[select, ]
d <- normalisations(POS3grSave)
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHPOS3gr <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHPOS3gr
# TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams <- cahPlotCol(
  myCAH,
  k = 9,
  main = "POS 3-grams (Transkr/Boudams/Pie/Pie)"
)
# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotPOS3grams <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - POS 3-grams")

Lemmas

# Load the lemma table (semicolon-separated, first column = row names)
data <- read.csv("data/transkr_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
# Remove the total-frequency column
data <- data[, -1]
# Bring the column names back to the form used in toKeep: strip the leading
# "X", drop the ".decolumnized" suffix and restore the hyphen after "Leg"
# (presumably all introduced by read.csv()'s name checking - confirm).
colnames(data) <- gsub("^X", "", colnames(data))
# fixed = TRUE: the dots are literal characters here. As regular expressions,
# ".decolumnized" and "Leg." would match ANY character in place of the dot
# (e.g. "Leg." would also rewrite "Lege").
colnames(data) <- gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Restrict to the selected columns, then discard rows left empty
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)

Burrows + vector-length norm

d <- data
# Feature selection based on Moisl 2011: column 4 of the table returned by
# selection() is used below as the row selector
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies of the selected rows, saved under LemmasSave
LemmasSave <- relativeFreqs(d)[select, ]
d <- normalisations(LemmasSave)
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHLemmas <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHLemmas
# TODO: heights
# barplot(sort(myCAH$height))
plotLemmas <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Lemmas (Transkr/Boudams/Pie/Pie)"
)
# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotLemmas <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Lemmas")

Function words from lemmas

# Candidate function lemmas can be inspected with:
# rownames(data)[1:250]
# The list is the value of the last expression evaluated in functionLemmas.R
functionLemmas <- source("functionLemmas.R")[["value"]]

Burrows + vector-length norm

# Relative frequencies restricted to the function-lemma rows; saved because
# the combined analysis stacks FLSave with the affix and POS tables
FLSave <- relativeFreqs(data)[functionLemmas, ]
d <- normalisations(FLSave)
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHFL <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHFL
# barplot(sort(myCAH$height))
# NOTE(review): k = 8 here, while the SOM plot below and compareHC() use
# k = 9 - confirm this is intended
plotFL <- cahPlotCol(
  myCAH,
  k = 8,
  main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)"
)
# plotCol(myCAH, main = "toto")
# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotFL <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Function words (lemmas)"
)

Affixes + POS 3-gr + Function words (lemmas)

# Stack the three saved tables (affixes, POS 3-grams, function lemmas) row-wise
data <- rbind(AffixesSave, POS3grSave, FLSave)
d <- normalisations(data)
# Agglomerative clustering on the transposed table (Manhattan metric, Ward method)
CAHGlob <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
myCAH <- CAHGlob
# TODO: heights
# barplot(sort(myCAH$height))
plotGlob <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Affixes + POS 3- grams + Function words (lemmas)"
)
# Same normalised table, clustered through somCluster()
somCAH <- somCluster(d)
somplotGlob <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)"
)

Plots

Analyses

#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)
# Panel of the eight agnes-based plots, laid out in two columns
gridExtra::grid.arrange(
  plotRaw3grams, plotForms,
  plotAffixes, plotFW,
  plotLemmas, plotFL,
  plotPOS3grams, plotGlob,
  ncol = 2
)

# Matching panel of the eight SOM-based plots
gridExtra::grid.arrange(
  somplotRaw3grams, somplotForms,
  somplotAffixes, somplotFW,
  somplotLemmas, somplotFL,
  somplotPOS3grams, somplotGlob,
  ncol = 2
)

Robustness

# Collect the eight saved agnes results under short labels and compare them
# pairwise with compareHC() at k = 9
cahList <- list(
  raw3grams = CAHRaw3gr,
  Forms = CAHForms,
  Affs = CAHAffs,
  FW = CAHFW,
  Lemmas = CAHLemmas,
  FunctLemm = CAHFL,
  POS3gr = CAHPOS3gr,
  Global = CAHGlob
)
compareHC(cahList, k = 9)
##           raw3grams     Forms      Affs        FW    Lemmas FunctLemm
## raw3grams 1.0000000 0.8135593 0.8305085 0.7966102 0.7288136 0.7288136
## Forms     0.8135593 1.0000000 0.6949153 0.7288136 0.6440678 0.6779661
## Affs      0.8644068 0.7118644 1.0000000 0.7796610 0.7457627 0.7118644
## FW        0.8135593 0.7288136 0.7627119 1.0000000 0.6949153 0.7627119
## Lemmas    0.7288136 0.6440678 0.7457627 0.7118644 1.0000000 0.6440678
## FunctLemm 0.7288136 0.6779661 0.6949153 0.7966102 0.6779661 1.0000000
## POS3gr    0.6779661 0.6779661 0.6440678 0.7627119 0.6440678 0.6610169
## Global    0.8305085 0.8135593 0.8135593 0.8135593 0.6610169 0.7966102
##              POS3gr    Global
## raw3grams 0.7288136 0.8135593
## Forms     0.6779661 0.7966102
## Affs      0.7288136 0.8474576
## FW        0.7627119 0.7627119
## Lemmas    0.6779661 0.6440678
## FunctLemm 0.7118644 0.7966102
## POS3gr    1.0000000 0.7627119
## Global    0.8135593 1.0000000